# Point the session at the assignment folder that holds KBO1.rda.
# NOTE(review): a hard-coded absolute path ties the script to one machine;
# consider running the script from the project folder instead.
setwd("F:\\1529407184_Assignment")

# load() restores the saved objects into the workspace and returns only their
# names, so its result is used to validate the file, not as the data itself.
loaded_objects <- load("KBO1.rda")
stopifnot("KBO1" %in% loaded_objects)
data <- KBO1

# Quick look at the column types and the first few values.
str(data)

# Natural log of the response, stored alongside the original columns.
data$ln_y <- log(data$Y)

# Pairwise scatterplots of the seven predictors. scatterplotMatrix() comes
# from the car package, which is not attached anywhere in the visible script,
# so it is called through its namespace.
car::scatterplotMatrix(~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = data)

# Drop the team and player columns from the data frame.
data$team <- NULL
data$player <- NULL

# Descriptive statistics for a single vector.
#
# Counts the missing values in `x`, drops them, and returns a named numeric
# vector holding: the number of non-missing values (n), the number of missing
# values (nmiss), a 0/1 outlier flag, the mean, the standard deviation, the
# minimum, nine quantiles (1%, 5%, 10%, 25%, 50%, 75%, 90%, 95%, 99%), the
# maximum, and the upper/lower control limits (mean +/- 3 standard deviations).
# The outlier flag is set when the maximum lies above the upper limit or the
# minimum lies below the lower limit.
mystats <- function(x) {
  missing_count <- sum(is.na(x))
  observed <- x[!is.na(x)]

  centre <- mean(observed)
  spread <- sd(observed)
  lowest <- min(observed)

  # One vectorised quantile() call; each label is prefixed so the element
  # names come out as "p1.1%", "q2.50%", and so on.
  cut_points <- c(
    p1 = 0.01, p5 = 0.05, p10 = 0.10,
    q1 = 0.25, q2 = 0.5, q3 = 0.75,
    p90 = 0.90, p95 = 0.95, p99 = 0.99
  )
  percentiles <- quantile(observed, unname(cut_points))
  names(percentiles) <- paste(names(cut_points), names(percentiles), sep = ".")

  highest <- max(observed)
  upper_limit <- centre + 3 * spread
  lower_limit <- centre - 3 * spread

  c(
    n = length(observed),
    nmiss = missing_count,
    outlier_flag = highest > upper_limit | lowest < lower_limit,
    mean = centre,
    stdev = spread,
    min = lowest,
    percentiles,
    max = highest,
    UC = upper_limit,
    LC = lower_limit
  )
}

# Columns to profile: Y, X1 through X7, and ln_y.
vars <- c("Y", paste0("X", 1:7), "ln_y")

# Run mystats() down each selected column (MARGIN = 2), then transpose so
# that every variable becomes a row and every statistic a column.
stats_by_column <- apply(data[vars], MARGIN = 2, FUN = mystats)
diag_stats <- t(data.frame(stats_by_column))
View(diag_stats)

# Outlier capping: any value above a column's ceiling is replaced by that
# ceiling. Only upper tails are capped, and only for the columns listed here.
# NOTE(review): the ceilings are hard-coded; presumably they were read off the
# diag_stats table -- confirm before reusing them on different data.
upper_caps <- c(Y = 63900, X2 = 19.39, X3 = 87.95, X6 = 1709.9, X7 = 17.78)

for (cap_var in names(upper_caps)) {
  ceiling_value <- upper_caps[[cap_var]]
  above_ceiling <- data[[cap_var]] > ceiling_value
  data[[cap_var]][above_ceiling] <- ceiling_value
}

# Split the data into training (70%) and testing (30%) sets.
# Only these two sets are created; there is no separate validation set.

# Fix the RNG seed so the same rows are drawn on every run.
set.seed(123)

n_rows <- nrow(data)
# sample.int() draws row positions without replacement and is safe for any
# row count, unlike 1:nrow(data), which yields c(1, 0) on an empty data frame.
train_ind <- sample.int(n_rows, size = floor(0.70 * n_rows))

training <- data[train_ind, ]
# Pick the hold-out rows with a logical mask: negative indexing with an empty
# train_ind would return zero rows instead of every row.
testing <- data[!(seq_len(n_rows) %in% train_ind), ]

# Building the model on the training dataset: ordinary least squares of Y on
# all seven predictors.
fit <- lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = training)
summary(fit)

anova(fit)


# Stepwise variable selection (step() compares models by AIC).
# "both" and "backward" start from the full model and may drop terms.
fit1 <- step(
  lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = training),
  direction = "both"
)

# Forward selection has to start from the intercept-only model and be given
# the full model as its upper scope. When scope is omitted, step() treats the
# starting model as the upper bound, so a forward search started from the full
# model has nothing to add and simply returns the full model unchanged.
fit2 <- step(
  lm(Y ~ 1, data = training),
  scope = list(lower = ~ 1, upper = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7),
  direction = "forward"
)

fit3 <- step(
  lm(Y ~ X1 + X2 + X3 + X4 + X5 + X6 + X7, data = training),
  direction = "backward"
)

# Coefficient tables and fit statistics for each selected model.
summary(fit1)
summary(fit2)
summary(fit3)

# Sequential analysis-of-variance tables for each selected model.
anova(fit1)
anova(fit2)
anova(fit3)
